Atlas output files are stored in the Example folder in the GitHub repository metagenome-atlas/Tutorial.

If you don’t already have a copy of the GitHub repository locally, you can run the following command from a Terminal. Note that RStudio includes a Terminal tab next to the Console tab. In the default configuration of RStudio, this panel is located in the bottom left hand corner.

git clone https://github.com/metagenome-atlas/Tutorial.git
library(dplyr)
library(readr)
library(stringr)
library(tidyr)
library(tibble)
library(ggplot2)
library(plotly)
library(pheatmap)
library(microbiome)
library(ape)
library(vegan)
library(useful)
library(kableExtra)
#library(RColorBrewer)

Metagenome assembled genomes

Taxonomy

Tax <- read_tsv("Example/Results/taxonomy.tsv")
kable(Tax)
user_genome kindom phylum class order family genus species
MAG11 Bacteria Firmicutes Bacilli Lactobacillales Lactobacillaceae Limosilactobacillus Limosilactobacillus reuteri
MAG30 Bacteria Firmicutes Bacilli Lactobacillales Lactobacillaceae Ligilactobacillus Ligilactobacillus murinus
MAG08 Bacteria Firmicutes Bacilli Lactobacillales Lactobacillaceae Lactobacillus Lactobacillus johnsonii
MAG40 Bacteria Firmicutes Bacilli Erysipelotrichales Erysipelotrichaceae Faecalibaculum Faecalibaculum rodentium
MAG42 Bacteria Firmicutes Bacilli Erysipelotrichales Erysipelatoclostridiaceae Erysipelatoclostridium Erysipelatoclostridium cocleatum
MAG20 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae Kineothrix Kineothrix sp000403275
MAG03 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae CAG-95 CAG-95 sp000403495
MAG33 Bacteria Actinobacteriota Coriobacteriia Coriobacteriales Atopobiaceae NM07-P-09 NM07-P-09 sp004793925
MAG39 Bacteria Proteobacteria Gammaproteobacteria Burkholderiales Burkholderiaceae Parasutterella Parasutterella sp900552195
MAG49 Bacteria Bacteroidota Bacteroidia Bacteroidales Bacteroidaceae Bacteroides Bacteroides intestinalis
MAG45 Bacteria Bacteroidota Bacteroidia Bacteroidales Bacteroidaceae Phocaeicola Phocaeicola vulgatus
MAG58 Bacteria Bacteroidota Bacteroidia Bacteroidales Bacteroidaceae Phocaeicola Phocaeicola sp002493165
MAG10 Bacteria Bacteroidota Bacteroidia Bacteroidales Muribaculaceae UBA7173 UBA7173 sp001689485
MAG35 Bacteria Bacteroidota Bacteroidia Bacteroidales Muribaculaceae UBA7173 UBA7173 sp001915385
MAG48 Bacteria Bacteroidota Bacteroidia Bacteroidales Muribaculaceae Paramuribaculum Paramuribaculum sp900553585
MAG02 Bacteria Bacteroidota Bacteroidia Bacteroidales Muribaculaceae CAG-485 CAG-485 sp002361155
MAG56 Bacteria Bacteroidota Bacteroidia Bacteroidales Tannerellaceae Parabacteroides Parabacteroides goldsteinii
MAG38 Bacteria Bacteroidota Bacteroidia Bacteroidales UBA932 RC9 RC9 sp002298075
MAG44 Bacteria Bacteroidota Bacteroidia Bacteroidales Rikenellaceae Alistipes Alistipes sp002428825
MAG24 Bacteria Bacteroidota Bacteroidia Bacteroidales Rikenellaceae Alistipes Alistipes sp002362235
MAG16 Bacteria Verrucomicrobiota Verrucomicrobiae Verrucomicrobiales Akkermansiaceae Akkermansia Akkermansia muciniphila
MAG51 Bacteria Deferribacterota Deferribacteres Deferribacterales Mucispirillaceae Mucispirillum Mucispirillum schaedleri
MAG12 Bacteria Firmicutes_A Clostridia Oscillospirales Acutalibacteraceae Eubacterium_R NA
MAG19 Bacteria Firmicutes_A Clostridia Oscillospirales Ruminococcaceae Angelakisella NA
MAG41 Bacteria Firmicutes_A Clostridia Oscillospirales Oscillospiraceae NA NA
MAG63 Bacteria Firmicutes_A Clostridia Oscillospirales Oscillospiraceae NA NA
MAG21 Bacteria Firmicutes_A Clostridia Oscillospirales Oscillospiraceae NA NA
MAG59 Bacteria Firmicutes_A Clostridia Oscillospirales Oscillospiraceae NA NA
MAG46 Bacteria Firmicutes_A Clostridia Oscillospirales Oscillospiraceae UBA9475 NA
MAG50 Bacteria Firmicutes_A Clostridia Oscillospirales Oscillospiraceae UBA9475 NA
MAG18 Bacteria Firmicutes_A Clostridia Oscillospirales Oscillospiraceae Lawsonibacter NA
MAG15 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae UBA7182 NA
MAG06 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae NA NA
MAG07 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae Dorea NA
MAG36 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae Dorea NA
MAG05 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae An181 NA
MAG32 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae 1XD42-69 NA
MAG28 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae 1XD42-69 NA
MAG53 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae 14-2 NA
MAG04 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae NA NA
MAG23 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae Kineothrix NA
MAG13 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae NA NA
MAG55 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae 1XD8-76 NA
MAG43 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae CAG-95 NA
MAG31 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae Acetatifactor NA
MAG01 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae Acetatifactor NA
MAG47 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae Acetatifactor NA
MAG52 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae Acetatifactor NA
MAG37 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae Acetatifactor NA
MAG17 Bacteria Firmicutes_A Clostridia Lachnospirales Lachnospiraceae Acetatifactor NA
MAG22 Bacteria Firmicutes_A Clostridia_A Christensenellales UBA3700 NA NA
MAG27 Bacteria Firmicutes_A Clostridia_A Christensenellales UBA3700 NA NA
MAG29 Bacteria Cyanobacteria Vampirovibrionia Gastranaerophilales Gastranaerophilaceae Zag111 NA
MAG26 Bacteria Proteobacteria Alphaproteobacteria Rs-D84 Rs-D84 Rs-D84 NA
MAG54 Bacteria Desulfobacterota Desulfovibrionia Desulfovibrionales Desulfovibrionaceae Mailhella NA
MAG57 Bacteria Bacteroidota Bacteroidia Bacteroidales Muribaculaceae CAG-873 NA
MAG25 Bacteria Bacteroidota Bacteroidia Bacteroidales Muribaculaceae Paramuribaculum NA
MAG61 Bacteria Bacteroidota Bacteroidia Bacteroidales Muribaculaceae Paramuribaculum NA
MAG60 Bacteria Bacteroidota Bacteroidia Bacteroidales Muribaculaceae Paramuribaculum NA
MAG09 Bacteria Bacteroidota Bacteroidia Bacteroidales Muribaculaceae CAG-485 NA
MAG14 Bacteria Bacteroidota Bacteroidia Bacteroidales Muribaculaceae CAG-485 NA
MAG34 Bacteria Bacteroidota Bacteroidia Bacteroidales Muribaculaceae CAG-485 NA
MAG62 Bacteria Bacteroidota Bacteroidia Bacteroidales Marinifilaceae Odoribacter NA
# create a short label for each species
Tax <- Tax %>%
  mutate(Labels = ifelse(is.na(species) & is.na(genus), paste0(family, " ", user_genome), species)) %>%
  mutate(Labels = ifelse(is.na(Labels), paste0(genus, " ", user_genome), Labels))

Genome quality

genome_quality <- read_tsv("Example/Results/genome_completeness.tsv") %>%
  mutate(Quality_Score = Completeness - (5*Contamination)) %>%
  mutate(Lineage = sapply(str_split(`Marker lineage`, " "), function(x) x[1])) %>%
  left_join(Tax, by = c("Bin Id" = "user_genome")) %>%
  mutate(Name = Labels) %>%
  select(-Labels)
plt <- ggplot(genome_quality, aes(x = Contamination, y = Completeness, color = phylum)) +
  geom_point() +
  theme_minimal()
ggplotly(plt)

Abundance

Counts <- read_tsv("Example/Results/counts/raw_counts_genomes.tsv")
kable(topleft(Counts, c = 10))
Sample ERR675518 ERR675519 ERR675520 ERR675521 ERR675522 ERR675523 ERR675524 ERR675525 ERR675526
MAG01 17393 281563 26242 331737 23531 269746 23891 173870 44879
MAG02 117204 34570 86293 36247 91372 47348 114237 36185 124830
MAG03 240075 29109 109919 40084 35514 47457 222384 33715 269218
MAG04 24199 726600 18037 747249 73140 290866 13242 339349 29056
MAG05 97526 201572 104119 254551 36218 484727 61292 375456 191088

Mapping rate

mapping_rate <- read_tsv("Example/Results/mapping_rate.tsv",
                         col_names = c("Sample", "Mapping_rate"), skip = 1)  %>%
  mutate(tmp = "samples")

ggplot(mapping_rate, aes(x = tmp, y = Mapping_rate)) +
  geom_jitter() +
  ylim(c(0, 1))+
  theme_minimal()

Stats based on raw counts

There are good reasons to use rawcounts and use centric log ratios, see more in

Gloor, Gregory B., Jean M. Macklaim, Vera Pawlowsky-Glahn, and Juan J. Egozcue. 2017. “Microbiome Datasets Are Compositional: And This Is Not Optional.” Frontiers in Microbiology 8 (November). Frontiers: 2224. doi:10.3389/fmicb.2017.02224.

For differencial abundance analysis see also the same paper.

# transform counts with centered log ratio
data <- Counts %>%
  column_to_rownames(var = "Sample") %>%
  as.data.frame()
data <- transform(data, "clr")
pheatmap(data, cluster_rows = T, cluster_cols = T,
         labels_row = Tax$Labels)

PCA (PCoA) of the robust aitchison distance

pcoa_data <- pcoa(dist(t(data), method = "euclidean"))
pcoa_df <- as.data.frame(pcoa_data$vectors)

plt <- ggplot(pcoa_df, aes(x = Axis.1, y = Axis.2, label = rownames(pcoa_df))) +
  geom_point() +
  labs(x = paste0("PC1 (", round(pcoa_data$values$Relative_eig[1] * 100, digits = 1), "%)"),
       y = paste0("PC2 (", round(pcoa_data$values$Relative_eig[2] * 100, digits = 1), "%)")) +
  theme_minimal()

ggplotly(plt)

Relative abundance

For the relative abundance we take the coverage over the genome not the raw counts. This inmplicit normalizes for genome size. The coverage is calculated as the median of the coverage values calculated in 1kb blocks.

D <- read_tsv("Example/Results/counts/median_coverage_genomes.tsv") %>%
  column_to_rownames(var = "X1") %>%
  as.data.frame()
kable(topleft(D, c= 10))
MAG01 MAG02 MAG03 MAG04 MAG05 MAG06 MAG07 MAG08 MAG09 MAG10
ERR675518 0.00 6.020 6.47 0.24 1.82 0.02 0.16 0.50 104.515 41.08
ERR675519 5.59 1.320 0.00 10.47 3.36 19.18 2.53 41.03 0.000 3.83
ERR675520 0.22 4.950 2.58 0.12 1.34 0.09 0.41 0.00 72.945 34.41
ERR675521 5.82 1.720 0.00 12.07 3.21 21.45 5.16 25.56 1.090 4.34
ERR675522 0.00 4.085 0.36 1.03 0.76 0.17 0.15 0.00 153.425 32.31
# calculate relative abundance
rel_ab <- sweep(D, 1, rowSums(D),`/`)
# get most abundant genomes
counts_per_genome <- data.frame(sums = colSums(rel_ab)) %>%
  rownames_to_column(var = "Sample") %>%
  left_join(Tax, by = c("Sample" = "user_genome")) %>%
  arrange(desc(sums))


ggplot(counts_per_genome %>%
         top_n(sums, n = 10), aes(x = reorder(Labels, sums), y = sums)) +
  geom_col() +
  labs(x = "", y = "Abundance [rel_ab]", title = "Most abundant genomes") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90))

Typical bar chart

level <- 'family'

grouped_data <- rel_ab %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column(var = "user_genome") %>%
  pivot_longer(cols = -user_genome, names_to = "Sample", values_to = "rel_ab") %>%
  left_join(Tax, by = "user_genome") %>%
  group_by(Sample, family) %>%
  summarise(summarized_rel_ab = sum(rel_ab))

ggplot(grouped_data, aes(x = Sample, y = summarized_rel_ab, fill = family)) +
  geom_col() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90)) +
  #scale_fill_manual(values=rep(brewer.pal(5,"Paired"), times=5))
  scale_fill_manual(values = c("#771155", "#AA4488", "#CC99BB", "#114477", "#4477AA", "#77AADD", "#117777", "#44AAAA", "#77CCCC", "#117744", "#44AA77", "#88CCAA", "#777711", "#AAAA44", "#DDDD77", "#774411", "#AA7744", "#DDAA77", "#771122", "#AA4455", "#DD7788"))

Functional annotation

Relative abundance of functional annotations per sample

The abundance is calculated as the sum of the relative abundance of all bacteria containing a function.

CAZy – Carbohydrate active enzymes

CAZy_annotations_genome <- read_tsv('Example/Results/annotations/CAZy.tsv')
CAZy_presence <- CAZy_annotations_genome %>%
  column_to_rownames("MAG")
CAZy_presence[CAZy_presence > 0] <- 1
kable(topleft(CAZy_presence, c=10))
AA10 CBM15 CBM20 CBM42 CBM48 CBM50 CBM6 CBM73 CE1 CE10
MAG01 0 0 0 0 1 1 0 0 1 0
MAG02 0 0 0 0 1 0 0 0 0 0
MAG03 0 0 0 0 1 0 0 0 1 0
MAG04 0 0 0 0 1 0 1 0 1 1
MAG05 0 0 0 0 1 0 0 0 0 0
CAZy_rel_ab <- as.matrix(rel_ab) %*% as.matrix(CAZy_presence)
kable(topleft(CAZy_rel_ab, c=10 ))
AA10 CBM15 CBM20 CBM42 CBM48 CBM50 CBM6 CBM73 CE1 CE10
ERR675518 0.0012643 0.0011379 0.3031266 0.1543316 0.9423574 0.1360306 0.0574845 0.0012643 0.2191756 0.0782721
ERR675519 0.0099955 0.0071900 0.0073721 0.0000000 0.8548648 0.2180490 0.0847189 0.0099955 0.2410156 0.1874795
ERR675520 0.0012764 0.0124793 0.3625563 0.2441963 0.9404411 0.1251638 0.1311756 0.0012764 0.1965012 0.0553194
ERR675521 0.0102133 0.0062096 0.0077355 0.0000000 0.8671134 0.1792772 0.1255137 0.0102133 0.3014670 0.2819922
ERR675522 0.0014887 0.0239595 0.1960975 0.0001018 0.9580486 0.1299576 0.1060681 0.0014887 0.2724231 0.1633520
pheatmap(CAZy_rel_ab, show_colnames = F)

KEGG orthologs

Kegg_annotations_genome <- read_tsv('Example/Results/annotations/KO.tsv')
Kegg_presence <- Kegg_annotations_genome %>%
  column_to_rownames("MAG")
Kegg_presence[Kegg_presence > 0] <- 1
kable(topleft(Kegg_presence, c=10 ))
K00001 K00002 K00003 K00004 K00005 K00008 K00009 K00010 K00012 K00013
MAG01 0 1 1 0 0 1 1 0 1 1
MAG02 0 0 0 0 0 0 0 0 1 0
MAG03 1 0 1 0 0 1 0 0 1 1
MAG04 1 1 1 1 0 1 1 1 1 1
MAG05 0 0 1 1 0 0 0 0 1 1
Kegg_rel_ab <- as.matrix(rel_ab) %*% as.matrix(Kegg_presence)
kable(topleft(Kegg_rel_ab, c=10 ))
K00001 K00002 K00003 K00004 K00005 K00008 K00009 K00010 K00012 K00013
ERR675518 0.4400105 0.0101831 0.9155433 0.0134598 0.0255867 0.7010194 0.2466482 0.0038035 0.7604636 0.6394890
ERR675519 0.7135613 0.1308889 0.9290963 0.0871965 0.0965058 0.7166948 0.2390116 0.0343831 0.6378603 0.7783985
ERR675520 0.3676506 0.0152943 0.8906205 0.0129238 0.0017437 0.7079264 0.2520429 0.0129466 0.8497521 0.6398199
ERR675521 0.5975252 0.1785444 0.9232943 0.1936529 0.0720826 0.7412069 0.3813456 0.0508551 0.6311794 0.7058832
ERR675522 0.4795587 0.0106501 0.9051609 0.0148999 0.0437836 0.6409640 0.3285809 0.0146073 0.6824891 0.5536766
pheatmap(Kegg_rel_ab, show_colnames = F)